package com.asiainfo.zqx;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler for the zgsb-cn.com news site.
 *
 * <p>The seed list page is tagged {@code "list"}; every article link found on it is queued
 * with the tag {@code "text"}. For each article page the title, publication time, read count,
 * writer, source and body are printed to standard output.
 *
 * <p>Example article URLs:
 * <pre>
 * http://www.zgsb-cn.com/news.asp?id=903
 * http://www.zgsb-cn.com/news.asp?id=865
 * http://www.zgsb-cn.com/news.asp?id=819
 * </pre>
 *
 * <p>NOTE: the extraction relies on fixed table positions in the site's HTML, so the scraped
 * data may be incomplete when a page deviates from the expected layout.
 */
public class ZhongGuoshangbao extends BreadthCrawler {
    /** List page the crawl starts from. */
    private static final String SEED_URL = "http://www.zgsb-cn.com/bclass.asp?cid=81";
    /** Only links starting with this prefix are queued as article pages. */
    private static final String ARTICLE_URL_PREFIX = "http://www.zgsb-cn.com/news.asp";

    private static final String TYPE_LIST = "list";
    private static final String TYPE_TEXT = "text";

    /** Index of the table on the list page whose links are followed. */
    private static final int LIST_TABLE_INDEX = 26;
    /** The article table is this many tables from the end of the article page. */
    private static final int ARTICLE_TABLE_OFFSET_FROM_END = 3;
    /** Rows 0 (title), 1 (meta line) and 3 (body) of the article table are read. */
    private static final int MIN_ARTICLE_ROWS = 4;
    /** The byline is searched only within this many leading characters of the body. */
    private static final int BYLINE_PREFIX_LENGTH = 20;

    /** Writer reported when no reporter name can be extracted from the byline. */
    private static final String DEFAULT_WRITER = "中国商报";
    /** Markers that end the reporter name: a "text/photo" credit or a closing parenthesis. */
    private static final String[] BYLINE_TERMINATORS = {"文/图", "）", ")"};

    /**
     * Creates the crawler and registers the seed list page.
     *
     * @param crawlPath passed through to {@link BreadthCrawler}; presumably the directory
     *                  holding the crawl database (confirm against the WebCollector docs)
     */
    public ZhongGuoshangbao(String crawlPath) {
        super(crawlPath, false);
        addSeed(new CrawlDatum(SEED_URL, TYPE_LIST));
        addRegex(".*");
        setThreads(1);
    }

    /**
     * Handles a fetched page: prints the article fields for {@code "text"} pages and queues
     * article links for {@code "list"} pages. Pages without a content type are ignored.
     *
     * @param page        the fetched page
     * @param crawlDatums receives the article links discovered on list pages
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.contentType() == null) {
            return;
        }
        if (page.matchType(TYPE_TEXT)) {
            printArticle(page);
        } else if (page.matchType(TYPE_LIST)) {
            collectArticleLinks(page, crawlDatums);
        }
    }

    /** Prints title, time, read count, writer, source and body of an article page. */
    private void printArticle(Page page) {
        Elements tables = page.select("table");
        if (tables.size() <= ARTICLE_TABLE_OFFSET_FROM_END) {
            return;
        }
        Elements rows = tables.get(tables.size() - ARTICLE_TABLE_OFFSET_FROM_END).select("tbody>tr");
        if (rows.size() < MIN_ARTICLE_ROWS) {
            // Layout differs from what the extraction expects; skip instead of throwing.
            System.err.println("Skipping article with unexpected layout: " + page.url());
            return;
        }

        String title = rows.get(0).text();
        String meta = rows.get(1).text();
        String body = rows.get(3).text();
        // The body may be shorter than the byline window, so clamp the prefix length.
        String bylinePrefix = body.substring(0, Math.min(BYLINE_PREFIX_LENGTH, body.length()));

        System.out.println("标题为" + title);
        System.out.println("时间为" + between(meta, "时间", "本文"));
        System.out.println("阅读量" + between(meta, "击", "次"));
        System.out.println("strwri:" + bylinePrefix);
        System.out.println("作者为:" + extractWriter(bylinePrefix));
        System.out.println("来源:中国商报");
        System.out.println("正文为" + body);
    }

    /** Queues every article link found in the list table as a {@code "text"} page. */
    private void collectArticleLinks(Page page, CrawlDatums crawlDatums) {
        Elements tables = page.select("table");
        if (tables.size() <= LIST_TABLE_INDEX) {
            System.err.println("Skipping list page with unexpected layout: " + page.url());
            return;
        }
        for (Element link : tables.get(LIST_TABLE_INDEX).select("a")) {
            String url = link.attr("abs:href");
            if (url.startsWith(ARTICLE_URL_PREFIX)) {
                crawlDatums.add(new CrawlDatum(url, TYPE_TEXT));
            }
        }
    }

    /**
     * Returns the text between the first {@code startMarker} and the next {@code endMarker}
     * after it, or an empty string when either marker is missing.
     */
    private static String between(String text, String startMarker, String endMarker) {
        int markerAt = text.indexOf(startMarker);
        if (markerAt < 0) {
            return "";
        }
        int from = markerAt + startMarker.length();
        int to = text.indexOf(endMarker, from);
        return to < 0 ? "" : text.substring(from, to);
    }

    /**
     * Extracts the reporter name from the leading characters of the article body.
     *
     * <p>The name is taken from after the "记者" marker up to the earliest terminator
     * (a "文/图" credit, or a full-width or ASCII closing parenthesis, since articles use
     * either), and is cut at the first space. When the byline does not mention both the
     * publication and a reporter, or no name remains, {@link #DEFAULT_WRITER} is returned.
     */
    private static String extractWriter(String bylinePrefix) {
        int reporterAt = bylinePrefix.indexOf("记者");
        if (reporterAt < 0 || !bylinePrefix.contains("中国商报/中国商网")) {
            return DEFAULT_WRITER;
        }
        // Skips the two-character marker plus one following character, presumably a
        // separator such as a space — TODO confirm against real bylines.
        int start = Math.min(reporterAt + 3, bylinePrefix.length());
        int end = bylinePrefix.length();
        for (String terminator : BYLINE_TERMINATORS) {
            // Search only after the name starts so an earlier parenthesis cannot yield end < start.
            int at = bylinePrefix.indexOf(terminator, start);
            if (at >= 0 && at < end) {
                end = at;
            }
        }
        String writer = bylinePrefix.substring(start, end);
        int spaceAt = writer.indexOf(' ');
        if (spaceAt >= 0) {
            writer = writer.substring(0, spaceAt);
        }
        return writer.isEmpty() ? DEFAULT_WRITER : writer;
    }

    /**
     * Runs the crawler with crawl path {@code "zhuana"} and {@code start(2)}
     * (presumably a crawl depth of 2: the list page, then its articles).
     */
    public static void main(String[] args) throws Exception {
        ZhongGuoshangbao zgs = new ZhongGuoshangbao("zhuana");
        zgs.start(2);
    }
}
